clustering analysis

安装量: 116
排名: #7375

安装

npx skills add https://github.com/aj-geddes/useful-ai-prompts --skill 'Clustering Analysis'
Clustering Analysis
Overview
Clustering partitions data into groups of similar observations without pre-defined labels, enabling discovery of natural patterns and structures in data.
When to Use
Segmenting customers based on purchasing behavior or demographics
Discovering natural groupings in data without prior knowledge of categories
Identifying market segments for targeted marketing campaigns
Organizing large datasets into meaningful categories for further analysis
Finding patterns in gene expression data or medical imaging
Grouping documents, products, or users by similarity for recommendation systems
Clustering Algorithms
K-Means
Partitioning into k clusters
Hierarchical
Dendrograms showing nested clusters
DBSCAN
Density-based arbitrary-shaped clusters
Gaussian Mixture
Probabilistic clustering
Agglomerative
Bottom-up hierarchical approach
Key Concepts
Cluster Validation
Metrics to evaluate cluster quality
Optimal Clusters
Methods to determine best k
Inertia
Within-cluster sum of squares
Silhouette Score
Measure of cluster separation
Dendrogram
Hierarchical clustering visualization Implementation with Python import pandas as pd import numpy as np import matplotlib . pyplot as plt from sklearn . cluster import KMeans , DBSCAN , AgglomerativeClustering from sklearn . mixture import GaussianMixture from sklearn . preprocessing import StandardScaler from sklearn . metrics import ( silhouette_score , silhouette_samples , davies_bouldin_score , calinski_harabasz_score ) from scipy . cluster . hierarchy import dendrogram , linkage import seaborn as sns

# Generate sample data: three Gaussian blobs of 100 points each,
# centered at (0,0), (5,5) and (-3,4), with a fixed seed for reproducibility.
np.random.seed(42)
n_samples = 300
centers = [[0, 0], [5, 5], [-3, 4]]
blobs = [np.random.randn(100, 2) + c for c in centers]
X = np.vstack(blobs)

# Standardize features to zero mean / unit variance before clustering,
# so no feature dominates the distance metric.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Means model selection: sweep k = 2..10 and record inertia (elbow
# method) and silhouette score for each fit.
inertias = []
silhouette_scores = []
k_range = range(2, 11)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Plot both diagnostics side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
axes[0].plot(k_range, inertias, 'bo-')
axes[0].set_xlabel('Number of Clusters (k)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].grid(True, alpha=0.3)
axes[1].plot(k_range, silhouette_scores, 'go-')
axes[1].set_xlabel('Number of Clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Analysis')
axes[1].grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Fit the final K-Means model at the chosen k (3, matching the three
# generated blobs).
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)

# K-Means visualization: a 1x3 figure shared by the next three panels
# (clusters, silhouette plot, dendrogram).
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panel 0: points colored by K-Means label, centroids marked with red X.
axes[0].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)
axes[0].scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    c='red', marker='X', s=200, edgecolors='black', linewidths=2,
)
axes[0].set_title(f'K-Means (k={optimal_k})')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')

# Panel 1: silhouette plot — one horizontal band per cluster, sorted by
# per-sample silhouette value; red dashed line marks the mean score.
ax = axes[1]
y_lower = 10
silhouette_vals = silhouette_samples(X_scaled, kmeans_labels)
for i in range(optimal_k):
    cluster_silhouette_vals = silhouette_vals[kmeans_labels == i]
    cluster_silhouette_vals.sort()
    size_cluster_i = cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    ax.fill_betweenx(
        np.arange(y_lower, y_upper), 0, cluster_silhouette_vals,
        alpha=0.7, label=f'Cluster {i}',
    )
    y_lower = y_upper + 10  # 10-point gap between cluster bands
ax.axvline(x=silhouette_score(X_scaled, kmeans_labels), color="red", linestyle="--")
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster Label')
ax.set_title('Silhouette Plot')

# Panel 2: Ward-linkage dendrogram, truncated to the last 10 merges.
linkage_matrix = linkage(X_scaled, method='ward')
dendrogram(linkage_matrix, ax=axes[2], truncate_mode='lastp', p=10)
axes[2].set_title('Dendrogram (Ward)')
axes[2].set_xlabel('Sample Index')
plt.tight_layout()
plt.show()

# Agglomerative (bottom-up) hierarchical clustering with Ward linkage,
# cut at the same number of clusters as K-Means.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hier_labels = hierarchical.fit_predict(X_scaled)

# DBSCAN: density-based clustering; label -1 marks noise points.
dbscan = DBSCAN(eps=0.4, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = list(dbscan_labels).count(-1)

# Gaussian Mixture Model: soft (probabilistic) clustering; keep both the
# hard labels and the per-sample membership probabilities.
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
gmm_labels = gmm.fit_predict(X_scaled)
gmm_proba = gmm.predict_proba(X_scaled)

# Side-by-side comparison of all four algorithms on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
algorithms = [
    (kmeans_labels, 'K-Means'),
    (hier_labels, 'Hierarchical'),
    (dbscan_labels, 'DBSCAN'),
    (gmm_labels, 'Gaussian Mixture'),
]
for idx, (labels, title) in enumerate(algorithms):
    ax = axes[idx // 2, idx % 2]
    # Skip noise points (-1) for DBSCAN; the mask is all-True elsewhere.
    mask = labels != -1
    scatter = ax.scatter(X[mask, 0], X[mask, 1], c=labels[mask],
                         cmap='viridis', alpha=0.6)
    # NOTE(review): the scraped source lost the operator between n_noise
    # and 0 — reconstructed as `> 0`, the only comparison that makes sense
    # for "plot noise points only if any exist".
    if title == 'DBSCAN' and n_noise > 0:
        noise_mask = labels == -1
        ax.scatter(X[noise_mask, 0], X[noise_mask, 1], c='red',
                   marker='x', s=100, label='Noise')
        ax.legend()
    ax.set_title(f'{title} (n_clusters={len(set(labels[mask]))})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
plt.tight_layout()
plt.show()

# Internal validation metrics for each algorithm. DBSCAN is scored on
# non-noise points only; if everything is noise, report NaN instead.
_core = dbscan_labels != -1
_has_core = n_noise < len(X_scaled)
validation_metrics = {
    'Algorithm': ['K-Means', 'Hierarchical', 'DBSCAN', 'GMM'],
    'Silhouette Score': [
        silhouette_score(X_scaled, kmeans_labels),
        silhouette_score(X_scaled, hier_labels),
        silhouette_score(X_scaled[_core], dbscan_labels[_core]) if _has_core else np.nan,
        silhouette_score(X_scaled, gmm_labels),
    ],
    'Davies-Bouldin Index': [
        davies_bouldin_score(X_scaled, kmeans_labels),
        davies_bouldin_score(X_scaled, hier_labels),
        davies_bouldin_score(X_scaled[_core], dbscan_labels[_core]) if _has_core else np.nan,
        davies_bouldin_score(X_scaled, gmm_labels),
    ],
    'Calinski-Harabasz Index': [
        calinski_harabasz_score(X_scaled, kmeans_labels),
        calinski_harabasz_score(X_scaled, hier_labels),
        calinski_harabasz_score(X_scaled[_core], dbscan_labels[_core]) if _has_core else np.nan,
        calinski_harabasz_score(X_scaled, gmm_labels),
    ],
}
metrics_df = pd.DataFrame(validation_metrics)
print("Clustering Validation Metrics:")
print(metrics_df)

# Cluster size table (DBSCAN excluded: its labels include noise and its
# cluster count may differ).
sizes_df = pd.DataFrame({
    'K-Means': pd.Series(kmeans_labels).value_counts().sort_index(),
    'Hierarchical': pd.Series(hier_labels).value_counts().sort_index(),
    'GMM': pd.Series(gmm_labels).value_counts().sort_index(),
})
print("\nCluster Sizes:")
print(sizes_df)

# GMM membership confidence: color each point by its highest component
# probability (greener = more confidently assigned).
fig, ax = plt.subplots(figsize=(10, 6))
membership = gmm_proba.max(axis=1)
scatter = ax.scatter(X[:, 0], X[:, 1], c=membership, cmap='RdYlGn',
                     alpha=0.6, s=50)
ax.set_title('Cluster Membership Confidence (GMM)')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
plt.colorbar(scatter, ax=ax, label='Membership Probability')
plt.show()

# Cluster characteristics: map centroids back to the original feature
# scale and print summary statistics for each K-Means cluster.
kmeans_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_df = pd.DataFrame(X, columns=['Feature 1', 'Feature 2'])
cluster_df['Cluster'] = kmeans_labels
for cluster_id in range(optimal_k):
    cluster_data = cluster_df[cluster_df['Cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} Characteristics:")
    print(cluster_data[['Feature 1', 'Feature 2']].describe())
Cluster Quality Metrics
Silhouette Score
-1 to 1 (higher is better)
Davies-Bouldin Index
Lower is better
Calinski-Harabasz Index
Higher is better
Inertia
Lower is better (KMeans only)
Algorithm Selection
K-Means
Fast, spherical clusters, k needs specification
Hierarchical
Produces dendrogram, interpretable
DBSCAN
Arbitrary shapes, handles noise
GMM
Probabilistic, soft assignments

Deliverables
- Optimal cluster count analysis
- Cluster visualizations
- Validation metrics comparison
- Cluster characteristics summary
- Silhouette plots
- Dendrogram for hierarchical clustering
- Membership assignments
返回排行榜